Replicating pipeline and results described in Galbraith et al. (2016). PNAS. https://www.pnas.org/content/113/4/1020



Overview of pipeline



Requirements


if(suppressWarnings(require("purrr"))==F){
  install.packages("purrr")
  library(purrr)
}

if(suppressWarnings(require("tidyverse"))==F){
  install.packages("tidyverse")
  library(tidyverse)
}

if(suppressWarnings(require("plyr"))==F){
  install.packages("plyr")
  library(plyr)
}

if(suppressWarnings(require("nlme"))==F){
  install.packages("nlme")
  library(nlme)
}

if(suppressWarnings(require("viridis"))==F){
  install.packages("viridis")
  library(viridis)
}

if(suppressWarnings(require("gridExtra"))==F){
  install.packages("gridExtra")
  library(gridExtra)
}

if(suppressWarnings(require("kableExtra"))==F){
  install.packages("kableExtra")
  library(kableExtra)
}


Setup


Sample metadata

metadata <- read.csv("metadata.csv")
kbl(metadata) %>% kable_styling()
sample.id parent cross phenotype individual lineage
875Q_SRR3033262 875Q A reproductive SRR3033262 EHB
875Q_SRR3033263 875Q A reproductive SRR3033263 EHB
875Q_SRR3033264 875Q A reproductive SRR3033264 EHB
875Q_SRR3033249 875Q A sterile SRR3033249 EHB
875Q_SRR3033250 875Q A sterile SRR3033250 EHB
875Q_SRR3033251 875Q A sterile SRR3033251 EHB
875D_SRR3033262 875D A reproductive SRR3033262 AHB
875D_SRR3033263 875D A reproductive SRR3033263 AHB
875D_SRR3033264 875D A reproductive SRR3033264 AHB
875D_SRR3033249 875D A sterile SRR3033249 AHB
875D_SRR3033250 875D A sterile SRR3033250 AHB
875D_SRR3033251 875D A sterile SRR3033251 AHB
888Q_SRR3033268 888Q A reproductive SRR3033268 AHB
888Q_SRR3033269 888Q A reproductive SRR3033269 AHB
888Q_SRR3033270 888Q A reproductive SRR3033270 AHB
888Q_SRR3033256 888Q A sterile SRR3033256 AHB
888Q_SRR3033257 888Q A sterile SRR3033257 AHB
888Q_SRR3033258 888Q A sterile SRR3033258 AHB
888D_SRR3033268 888D A reproductive SRR3033268 EHB
888D_SRR3033269 888D A reproductive SRR3033269 EHB
888D_SRR3033270 888D A reproductive SRR3033270 EHB
888D_SRR3033256 888D A sterile SRR3033256 EHB
888D_SRR3033257 888D A sterile SRR3033257 EHB
888D_SRR3033258 888D A sterile SRR3033258 EHB
882Q_SRR3033265 882Q B reproductive SRR3033258 EHB
882Q_SRR3033266 882Q B reproductive SRR3033259 EHB
882Q_SRR3033267 882Q B reproductive SRR3033260 EHB
882Q_SRR3033252 882Q B sterile SRR3033252 EHB
882Q_SRR3033253 882Q B sterile SRR3033253 EHB
882Q_SRR3033254 882Q B sterile SRR3033254 EHB
882Q_SRR3033255 882Q B sterile SRR3033255 EHB
882D_SRR3033265 882D B reproductive SRR3033258 AHB
882D_SRR3033266 882D B reproductive SRR3033259 AHB
882D_SRR3033267 882D B reproductive SRR3033260 AHB
882D_SRR3033252 882D B sterile SRR3033252 AHB
882D_SRR3033253 882D B sterile SRR3033253 AHB
882D_SRR3033254 882D B sterile SRR3033254 AHB
882D_SRR3033255 882D B sterile SRR3033255 AHB
894Q_SRR3033271 894Q B reproductive SRR3033271 AHB
894Q_SRR3033272 894Q B reproductive SRR3033272 AHB
894Q_SRR3033273 894Q B reproductive SRR3033273 AHB
894Q_SRR3033259 894Q B sterile SRR3033259 AHB
894Q_SRR3033260 894Q B sterile SRR3033260 AHB
894Q_SRR3033261 894Q B sterile SRR3033261 AHB
894D_SRR3033271 894D B reproductive SRR3033271 EHB
894D_SRR3033272 894D B reproductive SRR3033272 EHB
894D_SRR3033273 894D B reproductive SRR3033273 EHB
894D_SRR3033259 894D B sterile SRR3033259 EHB
894D_SRR3033260 894D B sterile SRR3033260 EHB
894D_SRR3033261 894D B sterile SRR3033261 EHB

Generate SNP:gene count matrices


Generate more convenient TSV from SNP:gene bed file

  1. Load SNPs_for_analysis.bed, keep columns 1 (chromosome), 2 (position), and 4 (SNP:gene).
  2. Split column 3 by “:”, create new columns 4 (SNP ID) and 5 (gene ID).
  3. Save as TSV.
SNPs <- read.table("sNPs_for_analysis_sorted.bed",sep="\t",header=F)[,c(1,2,4)]
names(SNPs) <- c("chr","pos","SNP_gene")
SNPs$SNP <- as.character(map(strsplit(SNPs$SNP_gene, split = ":"), 1))
SNPs$geneID <- as.character(map(strsplit(SNPs$SNP_gene, split = ":"), 2))
write.table(SNPs,"SNP_gene_overlaps.txt",sep="\t",quote=F,row.names=F)
kbl(head(SNPs)) %>% kable_styling()
SNPs <- read.table("SNP_gene_overlaps.txt",sep="\t",header=1)

Set up count matrix

  1. Create empty data frame SNP_counts with as many rows as SNP:genes.
  2. Create a list of all *.txt files in the counts_SNPs directory.
SNP_counts <- data.frame(matrix(ncol=0,nrow=length(SNPs$SNP_gene)))
SNP_counts$SNP_gene <- SNPs$SNP_gene

files.counts <- list.files(path="counts_tophat2", 
                           pattern="*.txt", 
                           full.names=TRUE, 
                           recursive=FALSE)
head(files.counts)
  1. For each file listed in files.counts:
  • Read in the file to tmp, keep columns 4 (SNP:gene) and 7 (counts).
  • Store the sample ID (lineParent_SRA) from the file name to tmp.name.
  • Set column names of tmp to column 1 (SNP:gene) and column 2 (tmp.name).
  • Left join tmp to SNP_counts.
  1. Set the row names of SNP_counts to the SNP:gene IDs stored in column 1 (SNP:gene).
  2. Remove column 1 (SNP:gene).
  3. Fill empty cells with 0.
  4. Save as CSV.
for(i in 1:length(files.counts)){
  tmp <- read.table(files.counts[[i]],header=F)[,c(4,7)]
  tmp.name <-  strsplit(strsplit(files.counts[[i]], 
                                 split = "/")[[1]][2],
                        split = "[.]")[[1]][1]
  names(tmp) <- c("SNP_gene",tmp.name)
  SNP_counts <- SNP_counts %>% 
    left_join(tmp, by = c('SNP_gene' = 'SNP_gene')) 
}

row.names(SNP_counts) <- SNP_counts$SNP_gene
SNP_counts$SNP_gene <- NULL

SNP_counts[is.na(SNP_counts)] <- 0

write.csv(SNP_counts,"SNP_gene_counts.csv")

kbl(head(SNP_counts[,c(1:5)])) %>% kable_styling()
SNP_counts <- read.csv("SNP_gene_counts.csv",row.names=1)
names(SNP_counts) <- gsub("X","",names(SNP_counts))


Preprocess F1 count matrices


Split count matrix by phenotype

Subset SNP_counts by sterile and reproductive phenotypes: split by column names in metadata under sample.id by phenotype.

sterile.IDs <- metadata[metadata$phenotype=="sterile","sample.id"]
reproductive.IDs <- metadata[metadata$phenotype=="reproductive","sample.id"]

sterile_counts <- SNP_counts[,names(SNP_counts)%in%sterile.IDs]
reproductive_counts <- SNP_counts[,names(SNP_counts)%in%reproductive.IDs]

Filter low-count SNPs

Pre-filtered SNPs
length(row.names(SNP_counts))
## [1] 59991

Remove rows where any column has <=lcf reads by cross.

lcf <- 1

sterile_counts <- sterile_counts[rowSums(sterile_counts)>lcf*(length(names(sterile_counts))/2),]
reproductive_counts <- reproductive_counts[rowSums(reproductive_counts)>lcf*(length(names(reproductive_counts))/2),]

l875.s <- metadata[metadata$parent%in%c("875D","875Q")&metadata$phenotype=="sterile","sample.id"]
l888.s <- metadata[metadata$parent%in%c("888D","888Q")&metadata$phenotype=="sterile","sample.id"]
l882.s <- metadata[metadata$parent%in%c("882D","882Q")&metadata$phenotype=="sterile","sample.id"]
l894.s <- metadata[metadata$parent%in%c("894D","894Q")&metadata$phenotype=="sterile","sample.id"]

sterile_counts <- sterile_counts[rowSums(sterile_counts[,l875.s])>lcf,]
sterile_counts <- sterile_counts[rowSums(sterile_counts[,l888.s])>lcf,]
sterile_counts <- sterile_counts[rowSums(sterile_counts[,l882.s])>lcf,]
sterile_counts <- sterile_counts[rowSums(sterile_counts[,l894.s])>lcf,]

l875.r <- metadata[metadata$parent%in%c("875D","875Q")&metadata$phenotype=="reproductive","sample.id"]
l888.r <- metadata[metadata$parent%in%c("888D","888Q")&metadata$phenotype=="reproductive","sample.id"]
l882.r <- metadata[metadata$parent%in%c("882D","882Q")&metadata$phenotype=="reproductive","sample.id"]
l894.r <- metadata[metadata$parent%in%c("894D","894Q")&metadata$phenotype=="reproductive","sample.id"]

reproductive_counts <- reproductive_counts[rowSums(reproductive_counts[,l875.r])>lcf,]
reproductive_counts <- reproductive_counts[rowSums(reproductive_counts[,l888.r])>lcf,]
reproductive_counts <- reproductive_counts[rowSums(reproductive_counts[,l882.r])>lcf,]
reproductive_counts <- reproductive_counts[rowSums(reproductive_counts[,l894.r])>lcf,]
Reproductive SNPs
length(row.names(reproductive_counts))
## [1] 36430
Sterile SNPs
length(row.names(sterile_counts))
## [1] 33008

Split count matrices by lineage and parent of origin

p pat/mat lineage
p1 pat AHB
p1 mat EHB
p2 pat EHB
p2 mat AHB
# Sterile
## p1i
p1.sterile.pat <- sterile_counts[,metadata[metadata$parent%in%c("875D","882D")&
                                             metadata$phenotype=="sterile","sample.id"]]
p1.sterile.mat <- sterile_counts[,metadata[metadata$parent%in%c("875Q","882Q")&
                                             metadata$phenotype=="sterile","sample.id"]]
names(p1.sterile.pat) <- names(p1.sterile.mat)
## p2i
p2.sterile.pat <- sterile_counts[,metadata[metadata$parent%in%c("888D","894D")&
                                             metadata$phenotype=="sterile","sample.id"]]
p2.sterile.mat <- sterile_counts[,metadata[metadata$parent%in%c("888Q","894Q")&
                                             metadata$phenotype=="sterile","sample.id"]]
names(p2.sterile.pat) <- names(p2.sterile.mat)

# Reproductive
## p1i
p1.reproductive.pat <- reproductive_counts[,metadata[metadata$parent%in%c("875D","882D")&
                                                       metadata$phenotype=="reproductive","sample.id"]]
p1.reproductive.mat <- reproductive_counts[,metadata[metadata$parent%in%c("875Q","882Q")&
                                                       metadata$phenotype=="reproductive","sample.id"]]
names(p1.reproductive.pat) <- names(p1.reproductive.mat)
## p2i
p2.reproductive.pat <- reproductive_counts[,metadata[metadata$parent%in%c("888D","894D")&
                                                       metadata$phenotype=="reproductive","sample.id"]]
p2.reproductive.mat <- reproductive_counts[,metadata[metadata$parent%in%c("888Q","894Q")&
                                                       metadata$phenotype=="reproductive","sample.id"]]
names(p2.reproductive.pat) <- names(p2.reproductive.mat)


Functions for statistical tests


Storer-Kim test [twobinom]

(Function from the WRS2 package). Test the hypothesis that two independent binomials have equal probability of success using the Storer–Kim method.

twobinom<-function(r1=sum(elimna(x)),n1=length(x),
                   r2=sum(elimna(y)),n2=length(y),
                   x=NA,y=NA,alpha=.05){
  # r1=number of successes in group 1
  # r2=number of successes in group 2
  # n1=number of observations in group 1
  # n2=number of observations in group 2
  n1p<-n1+1
  n2p<-n2+1
  n1m<-n1-1
  n2m<-n2-1
  chk<-abs(r1/n1-r2/n2)
  x<-c(0:n1)/n1
  y<-c(0:n2)/n2
  phat<-(r1+r2)/(n1+n2)
  m1<-outer(x,y,"-")
  m2<-matrix(1,n1p,n2p)
  flag<-(abs(m1)>=chk)
  m3<-m2*flag
  b1<-1
  b2<-1
  xv<-c(1:n1)
  yv<-c(1:n2)
  xv1<-n1-xv+1
  yv1<-n2-yv+1
  dis1<-c(1,pbeta(phat,xv,xv1))
  dis2<-c(1,pbeta(phat,yv,yv1))
  pd1<-NA
  pd2<-NA
  for(i in 1:n1)pd1[i]<-dis1[i]-dis1[i+1]
  for(i in 1:n2)pd2[i]<-dis2[i]-dis2[i+1]
  pd1[n1p]<-phat^n1
  pd2[n2p]<-phat^n2
  m4<-outer(pd1,pd2,"*")
  test<-sum(m3*m4)
  list(p.value=test,p1=r1/n1,p2=r2/n2,est.dif=r1/n1-r2/n2)
}

Wrapper for Storer-Kim test [test.df]

  1. Calculate %p1 and %p2 for each SNP.
  • %p1 = sum(AHB in EHBxAHB)/(AHB in EHBxAHB + EHB in EHBxAHB).
  • %p2 = sum(AHB in AHBxEHB)/(AHB in AHBxEHB + EHB in AHBxEHB).
  1. If either %p1 or %p2 is NA (due to divide by 0), set to 0.
  2. Compute Storer-Kim test p-value for each SNP.
  3. If test failed, set p-value to 1.
  4. For each possible parent and lineage bias, if %p1 and %p2 pass defined thresholds [high and low], set the associated test p-value to the Storer-Kim test p-value; otherwise, set to 1.
  5. Return a data.frame where each row corresponds to one SNP and the associated parent and lineage bias p-values.
test.df <- function(p1.pat.df,p1.mat.df,
                    p2.pat.df,p2.mat.df,high,low,
                    stest=c("CS","SK","thresholds")){
  return.list <- list()
  i.len <- length(row.names(p1.pat.df))
  for(i in 1:i.len){
    p1.pat <- p1.pat.df[i,]
    p1.mat <- p1.mat.df[i,]
    p2.pat <- p2.pat.df[i,]
    p2.mat <- p2.mat.df[i,]
    p1.s <- sum(p1.pat)
    p1.o <- sum(p1.pat,p1.mat)
    p1 <- p1.s/p1.o
    if(is.na(p1)){p1 <- 0}
    p2.s <- sum(p2.mat)
    p2.o <- sum(p2.pat,p2.mat)
    p2 <- p2.s/p2.o
    if(is.na(p2)){p2 <- 0}
    
    if(stest=="SK"){
      test <- twobinom(r1=p1.s,n1=p1.o,r2=p2.s,n2=p2.o)$p.value
    }
    
    if(stest=="CS"){
      test <- chisq.test(data.frame(X=c(p1.s,p2.s),Y=c(p1.o,p2.o)))$p.value
    }
    
    if(stest=="thresholds"){
      test <- 0
    }
    
    if(is.nan(test)){test <- 1}
    if(p1>high&p2<low){pat.test <- test}else{pat.test <- 1}
    if(p1<low&p2>high){mat.test <- test}else{mat.test <- 1}
    if(p1<low&p2<low){EHB.test <- test}else{EHB.test <- 1}
    if(p1>high&p2>high){AHB.test <- test}else{AHB.test <- 1}
    
    return.df <- data.frame(SNP_gene=row.names(p1.pat.df[i,]),
                            pat.p=pat.test,mat.p=mat.test,
                            AHB.p=AHB.test,EHB.p=EHB.test)
    
    return.list[[i]] <- return.df
  }
  return(bind_rows(return.list))
}

General linear mixed effects model [GLIMMIX]

GLIMMIX <- function(counts,metadata){
  counts$SNP_gene <- row.names(counts)
  parent.list <- list()
  parent.p.list <- list()
  lineage.list <- list()
  lineage.p.list <- list()
  counts$geneID <- as.character(unlist(map(strsplit(counts$SNP_gene, split = ":"), 2)))
  genelist <- unique(counts$geneID)
  for(i in 1:length(genelist)){
    counts.sub <- counts[counts$geneID==genelist[i],]
    counts.sub$geneID <- NULL
    counts.sub <- gather(counts.sub, sample.id, count, 
                         names(counts.sub), -SNP_gene, factor_key=TRUE)
    counts.sub <- join(counts.sub, metadata, by = "sample.id")[,-c(5:6)]
    counts.sub$parent <- substr(counts.sub$parent,4,4)
    counts.sub$parent <- as.factor(counts.sub$parent)
    counts.sub$SNP_gene <- as.factor(counts.sub$SNP_gene)
    counts.sub$lineage <- as.factor(counts.sub$lineage)
    counts.sub$individual <- as.factor(counts.sub$individual)
    
    test <- summary(lme(count~parent+lineage+parent*lineage,
                        random=list(SNP_gene=~1, individual=~1),
                        data=counts.sub))
    
    parent.list[i] <- test$tTable[2,"Value"]
    parent.p.list[i] <- test$tTable[2,"p-value"]
    lineage.list[i] <- test$tTable[3,"Value"]
    lineage.p.list[i] <- test$tTable[3,"p-value"]
  }
  return(data.frame(ID=genelist,
                    parent.dirpat.est=unlist(parent.list),
                    parent.dirpat.p=unlist(parent.p.list),
                    lineageEHB.est=unlist(lineage.list),
                    lineageEHB.p=unlist(lineage.p.list)))
}


Conduct statistical tests


sterile.SK <- test.df(p1.sterile.pat,
                      p1.sterile.mat,
                      p2.sterile.pat,
                      p2.sterile.mat,
                      low=0.4,high=0.6,"CS")
write.csv(sterile.SK,"sterileCS.csv", row.names=F)

# sterile.GLIMMIX <- GLIMMIX(sterile_counts,metadata)

reproductive.SK <- test.df(p1.reproductive.pat,
                           p1.reproductive.mat,
                           p2.reproductive.pat,
                           p2.reproductive.mat,
                           low=0.4,high=0.6,"CS")
write.csv(reproductive.SK,"reproductiveCS.csv", row.names=F)

# reproductive.GLIMMIX <- GLIMMIX(reproductive_counts,metadata)


Plot results


Description under construction.

Sterile F1s


Set up a data.frame to plot %p1 and %p2 for each SNP

p1.sterile.plot <- data.frame(rowSums(p1.sterile.pat)/
                                (rowSums(p1.sterile.mat)+
                                 rowSums(p1.sterile.pat)))
names(p1.sterile.plot) <- c("p1")

p2.sterile.plot <- data.frame(rowSums(p2.sterile.mat)/
                                (rowSums(p2.sterile.mat)+
                                 rowSums(p2.sterile.pat)))
names(p2.sterile.plot) <- c("p2")

Join results of Storer-Kim tests

sterile.plot <- cbind(p1.sterile.plot,p2.sterile.plot)
sterile.plot <- sterile.plot[row.names(sterile.plot)%in%sterile.SK$SNP_gene,]
sterile.plot <- cbind(sterile.plot,sterile.SK[,c(2:5)])
sterile.plot$SNP_gene <- row.names(sterile.plot)
sterile.plot$gene <- as.character(map(strsplit(sterile.plot$SNP_gene, split = ":"), 2))

For each gene, check whether all SNPs are biased in the same direction

sterile.plot$bias <- "NA"
sterile.plot[sterile.plot$pat.p<0.05,"bias"] <- "pat"
sterile.plot[sterile.plot$mat.p<0.05,"bias"] <- "mat"
sterile.plot[sterile.plot$EHB.p<0.05,"bias"] <- "EHB"
sterile.plot[sterile.plot$AHB.p<0.05,"bias"] <- "AHB"
biaslist <- data.frame(matrix(ncol=2,nrow=0))
names(biaslist) <- c("gene","bias")
genelist <- unique(sterile.plot$gene)
for(i in 1:length(genelist)){
  tmp <- unique(sterile.plot[sterile.plot$gene==genelist[i],"bias"])
  if(length(tmp)>1){
    if(length(tmp)==2){
      if(any(tmp%in%"NA")){
        bias <- tmp[!tmp%in%"NA"]
      }
    }else{
      bias <- "NA"
    }
  }else{bias <- tmp}
  biaslist <- rbind(biaslist,data.frame(gene=genelist[[i]], bias=bias))
}
sterile.plot <- sterile.plot %>% 
  left_join(biaslist, by = c('gene' = 'gene')) 
names(sterile.plot) <- c("p1","p2","pat.p","mat.p",
                         "AHB.p","EHB.p","SNP_gene","gene","xbias","bias")

bias.plot <- list()
for(i in 1:length(row.names(sterile.plot))){
  if(sterile.plot[i,"xbias"]==sterile.plot[i,"bias"]){
    bias.plot[i] <- sterile.plot[i,"xbias"]
  }else{
    bias.plot[i] <- "NA"
  }
}
sterile.plot$bias.plot <- unlist(bias.plot)
sterile.plot <- rbind(sterile.plot[sterile.plot$bias.plot%in%c("NA"),],
                      sterile.plot[sterile.plot$bias.plot%in%c("mat", "AHB", "EHB", "pat"),])
sterile.plot$bias.plot <- factor(sterile.plot$bias.plot,
                                 levels = c("NA","mat", "AHB", "EHB", "pat"))

Generate plot for sterile F1s

g1 <- ggplot(sterile.plot, aes(x=p1, y=p2,
                               color=bias.plot,alpha=0.8)) + 
  geom_point() + theme_classic() +
  xlab(expression(paste("% A allele in ",E[mother],
                        " x ",A[father],sep=""))) +
  ylab(expression(paste("% A allele in ",A[mother],
                        " x ",E[father],sep=""))) +
  ggtitle("sterile workers") +
  theme(text = element_text(size=20),
        plot.title = element_text(hjust = 0.5)) +
  scale_color_manual(breaks = levels(sterile.plot$bias)[-c(1)],
                     values=c("grey", magma(20)[c(2)], 
                              magma(20)[c(8)], magma(20)[c(12)], 
                              magma(20)[c(16)]))+
  guides(alpha=F, color=F)

g1

Reproductive F1s


Set up a data.frame to plot %p1 and %p2 for each SNP

p1.reproductive.plot <- data.frame(rowSums(p1.reproductive.pat)/
                                (rowSums(p1.reproductive.mat)+
                                 rowSums(p1.reproductive.pat)))
names(p1.reproductive.plot) <- c("p1")

p2.reproductive.plot <- data.frame(rowSums(p2.reproductive.mat)/
                                (rowSums(p2.reproductive.mat)+
                                 rowSums(p2.reproductive.pat)))
names(p2.reproductive.plot) <- c("p2")

Join results of Storer-Kim tests

reproductive.plot <- cbind(p1.reproductive.plot,p2.reproductive.plot)
reproductive.plot <- reproductive.plot[row.names(reproductive.plot)%in%reproductive.SK$SNP_gene,]
reproductive.plot <- cbind(reproductive.plot,reproductive.SK[,c(2:5)])
reproductive.plot$SNP_gene <- row.names(reproductive.plot)
reproductive.plot$gene <- as.character(map(strsplit(reproductive.plot$SNP_gene, split = ":"), 2))

For each gene, check whether all SNPs are biased in the same direction

reproductive.plot$bias <- "NA"
reproductive.plot[reproductive.plot$pat.p<0.05,"bias"] <- "pat"
reproductive.plot[reproductive.plot$mat.p<0.05,"bias"] <- "mat"
reproductive.plot[reproductive.plot$EHB.p<0.05,"bias"] <- "EHB"
reproductive.plot[reproductive.plot$AHB.p<0.05,"bias"] <- "AHB"
biaslist <- data.frame(matrix(ncol=2,nrow=0))
names(biaslist) <- c("gene","bias")
genelist <- unique(reproductive.plot$gene)
for(i in 1:length(genelist)){
  tmp <- unique(reproductive.plot[reproductive.plot$gene==genelist[i],"bias"])
  if(length(tmp)>1){
    if(length(tmp)==2){
      if(any(tmp%in%"NA")){
        bias <- tmp[!tmp%in%"NA"]
      }
    }else{
      bias <- "NA"
    }
  }else{bias <- tmp}
  biaslist <- rbind(biaslist,data.frame(gene=genelist[[i]], bias=bias))
}
reproductive.plot <- reproductive.plot %>% 
  left_join(biaslist, by = c('gene' = 'gene')) 
names(reproductive.plot) <- c("p1","p2","pat.p","mat.p",
                         "AHB.p","EHB.p","SNP_gene","gene","xbias","bias")

bias.plot <- list()
for(i in 1:length(row.names(reproductive.plot))){
  if(reproductive.plot[i,"xbias"]==reproductive.plot[i,"bias"]){
    bias.plot[i] <- reproductive.plot[i,"xbias"]
  }else{
    bias.plot[i] <- "NA"
  }
}
reproductive.plot$bias.plot <- unlist(bias.plot)
reproductive.plot <- rbind(reproductive.plot[reproductive.plot$bias.plot%in%c("NA"),],
                      reproductive.plot[reproductive.plot$bias.plot%in%c("mat", "AHB", "EHB", "pat"),])
reproductive.plot$bias.plot <- factor(reproductive.plot$bias.plot,
                                 levels = c("NA","mat", "AHB", "EHB", "pat"))

Generate plot for reproductive F1s

g2 <- ggplot(reproductive.plot, aes(x=p1, y=p2,
                               color=bias.plot,alpha=0.8)) + 
  geom_point() + theme_classic() +
  xlab(expression(paste("% A allele in ",E[mother],
                        " x ",A[father],sep=""))) +
  ylab(expression(paste("% A allele in ",A[mother],
                        " x ",E[father],sep=""))) +
  ggtitle("reproductive workers") +
  theme(text = element_text(size=20),
        plot.title = element_text(hjust = 0.5)) +
  scale_color_manual(breaks = levels(reproductive.plot$bias)[-c(1)],
                     values=c("grey", magma(20)[c(2)], 
                              magma(20)[c(8)], magma(20)[c(12)], 
                              magma(20)[c(16)]))+
  guides(alpha=F, color=F)

g2

Final plot


Make center tri-plot

gmid.df <- data.frame(Sterile=c(length(unique(sterile.plot[sterile.plot$bias=="mat","gene"])),
                                length(unique(sterile.plot[sterile.plot$bias=="AHB","gene"])),
                                length(unique(sterile.plot[sterile.plot$bias=="EHB","gene"])),
                                length(unique(sterile.plot[sterile.plot$bias=="pat","gene"]))),
                      Bias=c("mat","AHB","EHB","pat"),
                      Reproductive=c(length(unique(reproductive.plot[reproductive.plot$bias=="mat","gene"])),
                                     length(unique(reproductive.plot[reproductive.plot$bias=="AHB","gene"])),
                                     length(unique(reproductive.plot[reproductive.plot$bias=="EHB","gene"])),
                                     length(unique(reproductive.plot[reproductive.plot$bias=="pat","gene"]))))

sterile_IDs <- unique(as.character(map(strsplit(row.names(sterile_counts), split = ":"), 2)))
reproductive_IDs <- unique(as.character(map(strsplit(row.names(reproductive_counts), split = ":"), 2)))

## Test if # of sterile biased genes is different from reproductive biased genes
mat.test <- chisq.test(data.frame(Success=c(gmid.df[1,1],gmid.df[1,3]),
                                  Failure=c(length(sterile_IDs)-gmid.df[1,1],
                                            length(reproductive_IDs)-gmid.df[1,3]),
                                  row.names=c("Sterile","Reproductive")))$p.value
AHB.test <- chisq.test(data.frame(Success=c(gmid.df[2,1],gmid.df[2,3]),
                                  Failure=c(length(sterile_IDs)-gmid.df[2,1],
                                            length(reproductive_IDs)-gmid.df[2,3]),
                                  row.names=c("Sterile","Reproductive")))$p.value
EHB.test <- chisq.test(data.frame(Success=c(gmid.df[3,1],gmid.df[3,3]),
                                  Failure=c(length(sterile_IDs)-gmid.df[3,1],
                                            length(reproductive_IDs)-gmid.df[3,3]),
                                  row.names=c("Sterile","Reproductive")))$p.value
pat.test <- chisq.test(data.frame(Success=c(gmid.df[4,1],gmid.df[4,3]),
                                  Failure=c(length(sterile_IDs)-gmid.df[4,1],
                                            length(reproductive_IDs)-gmid.df[4,3]),
                                  row.names=c("Sterile","Reproductive")))$p.value
## Build table
gmid.df$`.` <- c(mat.test,AHB.test,EHB.test,pat.test)

gmid.df <- gmid.df[,c(4,1,2,3)]
nsrows <- row.names(gmid.df[gmid.df$`.`>0.05,])
gmid.df$`.` <- formatC(gmid.df$`.`, format = "e", digits = 2)
gmid.df[nsrows,"."] <- "(ns)"

cols <- matrix("black", nrow(gmid.df), ncol(gmid.df))
cols[1,3] <- magma(20)[c(2)]
cols[2,3] <- magma(20)[c(8)]
cols[3,3] <- magma(20)[c(12)]
cols[4,3] <- magma(20)[c(16)]

ccols <- matrix("white", nrow(gmid.df), ncol(gmid.df))
ccols[1,3] <- "#e4d8d1"
ccols[2,3] <- "#e4d8d1"
ccols[3,3] <- "#e4d8d1"
ccols[4,3] <- "#e4d8d1"
ccols[1,2] <- "#f4efea"
ccols[2,2] <- "#f4efea"
ccols[3,2] <- "#f4efea"
ccols[4,2] <- "#f4efea"
ccols[1,4] <- "#f4efea"
ccols[2,4] <- "#f4efea"
ccols[3,4] <- "#f4efea"
ccols[4,4] <- "#f4efea"

cfonts <- matrix("plain", nrow(gmid.df), ncol(gmid.df))
cfonts[1,3] <- "bold"
cfonts[2,3] <- "bold"
cfonts[3,3] <- "bold"
cfonts[4,3] <- "bold"

tt <- ttheme_default(core=list(fg_params = list(col = cols, 
                                                cex = 1,
                                                fontface = cfonts),
                               bg_params = list(col=NA,
                                                fill = ccols),
                               padding.h=unit(2, "mm")),
                     rowhead=list(bg_params = list(col=NA)),
                     colhead=list(bg_params = list(fill = c("white",
                                                            "#f4efea",
                                                            "#e4d8d1",
                                                            "#f4efea")),
                                  fg_params = list(rot=90,
                                                   cex = 1,
                                                   col = c("white",
                                                           "black",
                                                           "black",
                                                           "black"))))

gmid <- tableGrob(gmid.df, rows = NULL, theme=tt)

plot(gmid)

Join plots to generate the final plot

fig1 <- arrangeGrob(g1, gmid, g2, widths=c(5,2.5,5))
ggsave(file="fig1.png", plot=fig1, width=18, height=6)